1 Overview

This R Markdown document reproduces the main analyses and figures for tumor evolution in high-clonality LUAD (Lung Adenocarcinoma) dataset from the Sherlock-Lung study

manuscript title: Deciphering lung adenocarcinoma evolution and the role of LINE-1 retrotransposition

Fig. 2: Features associated with lung tumor latency.


2 Load Required Libraries and Set Plotting Styles

(You may need to install some packages if missing)

library(tidyverse)
library(ggplot2)
library(ggsankey)
library(scales)
library(hrbrthemes)   # For theme_ipsum_rc
library(cowplot)
library(forcats)
library(ggrepel)
library(ggnewscale)
library(data.table)
library(ggasym)
library(ggpmisc)
library(broom)
library(ggsci)
library(ggpubr)

# --- Define Helper Functions (if any custom) -----------------------------------
# Please source your custom functions here if needed, or place them in ./functions/
# source('./functions/your_custom_functions.R')

# --- Load Input Datasets ------------------------------------------------------
# All data files should be placed in the appropriate subfolders.
# Example folder structure:
#   ./data/           - for main input files
#   ./functions/      - for custom R functions
#   ./output/         - for saving plots and results

# Replace the filenames below with your actual dataset filenames.
load('./data/BBsolution_final3_short.RData',verbose = T)
## Loading objects:
##   BBsolution
##   BBsolution4
##   hq_samples
##   wgs_groups_info
##   wgs_groups_info2
load('./data/sp_group_data.RData',verbose = T)
## Loading objects:
##   sp_group_color
##   sp_group_color_new
##   sp_group_data
##   sp_group_data2
load('./data/covdata0.RData',verbose = T)
## Loading objects:
##   covdata0
load('./data/clinical_data.RData',verbose = T)
## Loading objects:
##   clinical_data
load('./data/sherlock_data_all.RData',verbose = T)
## Loading objects:
##   sherlock_data
##   sherlock_data_full
##   sherlock_overall
##   sherlock_type_colors
##   sherlock_freq
##   sherlock_freq_hq
load('./data/sherlock_variable.RData',verbose = T)
## Loading objects:
##   sherlock_variable
load('./data/sp_group_data.RData',verbose = T)
## Loading objects:
##   sp_group_color
##   sp_group_color_new
##   sp_group_data
##   sp_group_data2
load('./data/id2data.RData',verbose = T)
## Loading objects:
##   id2data
##   id2color
load('./data/tedata.RData',verbose = T)
## Loading objects:
##   tedata
load('./data/RNASeq_Exp.RData',verbose = T)
## Loading objects:
##   cdata3
##   rdata1
##   adata
load('./data/suvdata.RData',verbose = T)
## Loading objects:
##   suvdata
# load function -----------------------------------------------------------
load('./data/ZTW_functions.RData',verbose = T)
## Loading objects:
##   barcode2spg
##   sp_group_color
##   sp_group_color_new
##   sp_group_lift
##   sp_group_data
##   sp_group_major_new
##   sp_group_major
# load analysis related data set
load('./data/Chronological_timing.RData',verbose = T)
## Loading objects:
##   mrate
##   MRCAdata
##   subclonesTimeAbs
##   subclonesTimeAbsLinear
##   finalPloidy
##   finalPower
##   effGenome
##   finalBB
##   finalSnv
##   finalClusters
##   remove
##   branchDeam
##   qRateDeam
##   rateDeam
##   timingclass_groups
##   timingInfo
tmp <- colnames(MRCAdata)
MRCAdata <- MRCAdata %>% select(-SP_Group) %>% left_join(sp_group_data2) %>% select(-SP_Group_New) %>% select(one_of(tmp))

## analysis limited to luad only
hq_samples2 <- covdata0 %>% filter(Histology == 'Adenocarcinoma', Tumor_Barcode %in% hq_samples) %>% pull(Tumor_Barcode)
rm(list=c('hq_samples'))

3 Original codes for performing association analysis for all genomic variables with latency variables

#  Latency difference among all genome alteration or features
tdata <- MRCAdata %>% filter(!is.na(Latency)) %>% select(Tumor_Barcode,Acc=acceleration,Latency) #Histology == "Adenocarcinoma"; acceleration == '1x'
#tdata <- MRCAdata %>% filter(!is.na(Latency),acceleration == '1x',SP_Group %in% c('N_A','N_U')) %>% select(Tumor_Barcode,Latency) #Histology == "Adenocarcinoma"

## wicox test ## 
tdata <- sherlock_data_full %>% 
  filter(!(Type %in% c('Signature_Denovo'))) %>% 
  filter(Tumor_Barcode %in% tdata$Tumor_Barcode) %>% 
  mutate(Gene=paste0(Gene,"|",Type)) %>% 
  select(Tumor_Barcode,Gene,Alteration) %>% 
  left_join(tdata) %>% 
  #left_join(freq_mrca %>% dplyr::rename(Gene=name))  %>% 
  left_join(sp_group_data2)
#filter(Freq>0.03)

tdata_all <- bind_rows(
  tdata,
  tdata %>% mutate(SP_Group='ALL',SP_Group_New='ALL'),
  tdata %>% filter(SP_Group_New %in% c('EU_N','AS_N')) %>% mutate(SP_Group='Non-smokers',SP_Group_New='Non-smokers')
) 

#tdata <- tdata %>% filter(Freq > 0.03)

tresult_all <- tdata_all %>%
  group_by(SP_Group_New,Acc,Gene) %>%
  do(tresult = safely(wilcox.test)(Latency ~ Alteration, data=. )) %>%
  mutate(tresult_null = map_lgl(tresult['result'], is.null)) %>%
  filter(!tresult_null) %>%
  mutate(fit = list(tidy(tresult[['result']]))) %>%
  select(SP_Group_New,Acc,Gene,fit) %>%
  unnest(cols = c(fit)) %>%
  ungroup() %>%
  arrange(p.value) %>%
  mutate(TMP=Gene) %>%
  separate(col = 'TMP',into = c('Gene_short','Type'),sep = '\\|') %>%
  ungroup()

# add diff
tdata_all <- tdata_all %>% mutate(Alteration = case_when(
  Alteration %in% c("Female","Y","Yes","Non-Smoker","WGD") ~ "Yes",
  Alteration %in% c("Male","N","No","Smoker","nWGD") ~ "No",
  TRUE ~ NA_character_
)) 
tmp <- tdata_all %>% filter(!is.na(Alteration)) %>% group_by(SP_Group_New,Acc,Gene,Alteration) %>% summarise(value=median(Latency,na.rm = T)) %>% ungroup() %>% pivot_wider(names_from = 'Alteration',values_from = value) %>% mutate(diff=Yes-No)

# add freq
tmpsize <- tdata_all %>% drop_na() %>% select(SP_Group_New,Acc,Tumor_Barcode,Gene) %>% unique() %>% group_by(SP_Group_New,Acc) %>% count(Gene) %>% dplyr::rename(size=n)
freq_mrca <- tdata_all %>% drop_na()  %>% count(SP_Group_New,Acc,Gene,Alteration) %>% filter(!is.na(Alteration)) %>% left_join(tmpsize)%>% mutate(Freq=n/size) %>% group_by(SP_Group_New,Acc,Gene) %>% arrange(Freq) %>% dplyr::slice(1) %>% ungroup() %>% select(SP_Group_New,Acc,Gene,Freq) %>% mutate(Freq=if_else(Freq==1,0,Freq))

# tmpsize <- mdata %>% select(Tumor_Barcode,name) %>% unique()  %>% count(name) 
# freq_mrca <- mdata %>% count(name,value) %>% filter(!is.na(value)) %>% left_join(tmpsize)%>% mutate(Freq=n/size) %>% group_by(name) %>% arrange(Freq) %>% dplyr::slice(1) %>% ungroup() %>% select(name,Freq) %>% mutate(Freq=if_else(Freq==1,0,Freq))

tresult_all <- tresult_all %>% left_join(tmpsize) %>% left_join(tmp) %>% left_join(freq_mrca)

#saveRDS(tresult_all,file='latency_tresult.RDS')
tdata_all <- tdata_all %>% 
  mutate(TMP=Gene) %>%
  separate(col = 'TMP',into = c('Gene_short','Type'),sep = '\\|')

save(tresult_all,tdata_all,file='latency_tresult.RData')

4 Fig. 2d - Latency association with mutational signatures

Associations between tumor latency and the presence of specific mutational signatures. The vertical blue dashed line represents a 5-year latency difference between tumors with and without each mutational signature, and the horizontal line indicates the significance threshold (FDR<0.05).

# Signatures Only
load('./data/latency_tresult.RData')
resulttmp <- tresult_all %>% 
  filter(Type=='Signature_Cosmic_final',!str_detect(Gene,'APOBEC')) %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New=='ALL') %>% 
  filter(Freq>0.01,Gene_short!='SBS5') %>% 
  mutate(Sig=if_else(str_detect(Gene,'^SBS'),'SBS',if_else(str_detect(Gene,'^DBS'),'DBS',if_else(str_detect(Gene,'^ID'),'ID',NA_character_)))) %>% 
  mutate(Sig=factor(Sig,levels=c('SBS','ID','DBS'))) %>% 
  group_by(Sig) %>% 
  mutate(FDR=p.adjust(p.value)) %>% 
  ungroup()

#filter(p.value<0.1)
#filter(str_detect(Gene_short,'^SBS'))

datatmp <- tdata_all %>% 
  filter(Type=='Signature_Cosmic_final') %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New=='ALL') %>% 
  filter(Gene_short %in% resulttmp$Gene_short) %>% 
  mutate(Sig=if_else(str_detect(Gene,'^SBS'),'SBS',if_else(str_detect(Gene,'^DBS'),'DBS',if_else(str_detect(Gene,'^ID'),'ID',NA_character_)))) %>% 
  mutate(Sig=factor(Sig,levels=c('SBS','ID','DBS')))

#tmp <- datatmp %>% count(Gene,Gene_short,Type,Alteration) %>% filter(n>5) %>% count(Gene,Gene_short,Type) %>% filter(n==2) %>% pull(Gene)
#datatmp <- datatmp %>% filter(Gene %in% tmp)
#resulttmp <- resulttmp %>% filter(Gene %in% tmp)

tmplevel <- datatmp %>% 
  filter(Alteration == "Yes") %>% 
  group_by(Gene_short) %>% 
  summarise(value=median(Latency)) %>% 
  arrange(value) %>% 
  pull(Gene_short) 

resulttmp <- resulttmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel)) 
datatmp <- datatmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel)) 

# barplot 
p1 <- datatmp %>% 
  #filter(Gene_short=='ID-Novel-C') %>% 
  ggplot(aes(Gene_short,Latency,fill=Alteration))+
  geom_point(pch=21,size=1,position=position_jitterdodge(jitter.width = 0.1,dodge.width = 0.5),color="black")+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
  #scale_fill_manual(values = sigcol)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
  theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
  labs(x = "", y = 'Latency, years before diagnosis')+
  scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
  #guides(fill="none")+
  facet_grid(.~Sig,scales = 'free',space='free')+
  panel_border(color = 'black',linetype = 1)
p1

p2 <- resulttmp %>%
  ggplot(aes(Gene_short,"1",fill=-log10(p.value)))+
  geom_tile()+
  facet_grid(.~Sig,scales = 'free',space='free')+
  scale_fill_viridis_c(option = "D")+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = FALSE,grid_col = "gray90",ticks = F)+
  theme(axis.text.x = element_blank(),axis.text.y = element_blank(),legend.position = 'top',legend.key.width = unit(1,"cm"),legend.key.height = unit(0.3,"cm"),panel.spacing.x = unit(0.1,'cm'))+
  labs(x = "", y = '',fill="Wilcox test, -log10(P)\n")
#guides(fill="none")+
# panel_border(color = 'black',linetype = 1)
p2

#plot_grid(p2,p1,axis = 'v',align = 'lr',ncol = 1,rel_heights = c(1.1,4))

##ggsave(filename = './output/latency_signatures.pdf',width = 16,height = 10,device = cairo_pdf)

## vacanol plot

resulttmp %>% 
  mutate(Sig=as.factor(Sig)) %>% 
  ggplot(aes((diff),-log10(FDR),fill=Sig))+
  geom_hline(yintercept = -log10(0.05),linetype=2,col='red',size=0.5)+
  geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
  geom_point(aes(size=Freq),pch=21,stroke=0.2)+
  scale_size_binned(labels = percent_format())+
  scale_fill_manual(values = ncicolpal[c(1,3,4)])+
  scale_x_continuous(breaks = pretty_breaks())+
  ggrepel::geom_text_repel(data=resulttmp,aes(label=Gene_short),max.overlaps = 30,size=3)+
  labs(x='Latency Change (Years)',y='-log10(FDR)',size='Frequency',fill='Signature')+
  guides(fill = guide_legend(override.aes = list(size=3.5)))+
  theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
  panel_border(color = 'black',size = 0.5)+
  coord_cartesian(clip = 'off')

#ggsave(filename = './output/latency_fdr_signatures.pdf',width = 7,height = 5.5,device = cairo_pdf)

5 Fig. 2a - Latency association with driver genes

Associations between tumor latency and driver gene mutation status. The vertical blue dashed line indicates a 5-year latency difference between driver gene wild type and mutant groups, and the horizontal line represents the significance threshold (FDR<0.05).

# Driver Mutations
#load('latency_tresult.RData')
drglist <- readRDS('./data/drivers_intogene.RDS') %>% pull(symbol)

resulttmp <- tresult_all %>% 
  filter(Type=='Mutation_Driver',Gene_short %in% drglist) %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New=='ALL') %>% 
  filter(Freq>0.01) %>% 
  mutate(FDR=p.adjust(p.value)) 

#filter(p.value<0.1)
#filter(str_detect(Gene_short,'^SBS'))

datatmp <- tdata_all %>% 
  filter(Type=='Mutation_Driver',Gene_short %in% drglist) %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New=='ALL') %>% 
  filter(Gene_short %in% resulttmp$Gene_short)

#tmp <- datatmp %>% count(Gene,Gene_short,Type,Alteration) %>% filter(n>5) %>% count(Gene,Gene_short,Type) %>% filter(n==2) %>% pull(Gene)
#datatmp <- datatmp %>% filter(Gene %in% tmp)
#resulttmp <- resulttmp %>% filter(Gene %in% tmp)

tmplevel <- datatmp %>% 
  filter(Alteration == "Yes") %>% 
  group_by(Gene_short) %>% 
  summarise(value=median(Latency)) %>% 
  arrange(value) %>% 
  pull(Gene_short) 

resulttmp <- resulttmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel)) 
datatmp <- datatmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel)) 
## vacanol plot

resulttmp %>% 
  ggplot(aes((diff),-log10(FDR)))+
  geom_hline(yintercept = -log10(0.05),linetype=2,col='red',size=0.5)+
  geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
  geom_point(aes(size=Freq),pch=21,stroke=0.1,fill='#3D4551',col='white')+
  scale_size_binned(labels = percent_format())+
  scale_x_continuous(breaks = pretty_breaks())+
  ggrepel::geom_text_repel(data=resulttmp %>% filter(FDR<0.2),aes(label=Gene_short),max.overlaps = 30,size=4)+
  labs(x='Latency Change (Years)',y='-log10(FDR)',size='Mutation frequency',fill='Signature')+
  guides(fill = guide_legend(override.aes = list(size=3.5)))+
  theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
  theme(legend.position = 'top',legend.key.width = unit(1,'cm'))+
  panel_border(color = 'black',size = 0.5)+
  coord_cartesian(clip = 'off')

resulttmp %>% 
  ggplot(aes((diff),-log10(p.value)))+
  geom_hline(yintercept = -log10(0.001),linetype=2,col='red',size=0.5)+
  geom_hline(yintercept = -log10(0.05),linetype=2,col='orange',size=0.5)+
  geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
  geom_point(aes(size=Freq),pch=21,stroke=0.1,fill='#3D4551',col='white')+
  scale_size_binned(labels = percent_format())+
  scale_x_continuous(breaks = pretty_breaks())+
  scale_y_continuous(breaks = pretty_breaks())+
  ggrepel::geom_text_repel(data=resulttmp %>% filter(p.value<0.05),aes(label=Gene_short),max.overlaps = 30,size=3.5)+
  labs(x='Latency Change (Years)',y='-log10(FDR)',size='Mutation frequency',fill='Signature')+
  guides(fill = guide_legend(override.aes = list(size=3.5)))+
  theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
  panel_border(color = 'black',size = 0.5)+
  coord_cartesian(clip = 'off')

#ggsave(filename = './output/latency_driverGene_signatures.pdf',width = 5,height = 5.5,device = cairo_pdf)

## for EGFR
tmp <- resulttmp <- tresult_all %>% 
  filter(Type=='Mutation_Driver',Gene_short %in% 'EGFR') %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New %in% c('AS_N','EU_N','EU_S',"Others")) %>% 
  mutate(label=paste0(SP_Group_New,'\nP=',scientific_format()(p.value))) %>%  select(SP_Group_New,label)

tdata_all %>% 
  filter(Type=='Mutation_Driver',Gene_short %in% 'EGFR') %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New %in% c('AS_N','EU_N','EU_S',"Others")) %>% 
  mutate(Aternation=factor(Alteration,levels=c('Yes','No'))) %>% 
  left_join(tmp) %>% 
  ggplot(aes(Alteration,Latency,fill=Alteration))+
  geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
  #scale_fill_manual(values = sigcol)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
  theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'bottom',panel.spacing.x = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 2))+
  labs(x = "", y = 'Latency, years before diagnosis',fill='EGFR Mutation')+
  scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
  #guides(fill="none")+
  facet_grid(.~label,scales = 'free',space='free')+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/latency_driverGene_signatures2.pdf',width = 6,height = 6,device = cairo_pdf)


## for KRAS
tmp <- resulttmp <- tresult_all %>% 
  filter(Type=='Mutation_Driver',Gene_short %in% 'KRAS') %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New %in% c('AS_N','EU_N','EU_S',"Others")) %>% 
  mutate(label=paste0(SP_Group_New,'\nP=',scientific_format()(p.value))) %>%  select(SP_Group_New,label)


tdata_all %>% 
  filter(Type=='Mutation_Driver',Gene_short %in% 'KRAS') %>% 
  filter(Acc=='1x') %>% 
  filter(SP_Group_New %in% c('AS_N','EU_N','EU_S','Others')) %>% 
  mutate(Aternation=factor(Alteration,levels=c('Yes','No'))) %>% 
  left_join(tmp) %>% 
  ggplot(aes(Alteration,Latency,fill=Alteration))+
  geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
  #scale_fill_manual(values = sigcol)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
  theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'bottom',panel.spacing.x = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 2))+
  labs(x = "", y = 'Latency, years before diagnosis',fill='KRAS Mutation')+
  scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
  #guides(fill="none")+
  facet_grid(.~label,scales = 'free',space='free')+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/latency_driverGene_signatures_kras.pdf',width = 6,height = 6,device = cairo_pdf)

# # example of regression
# tdata_all %>% 
#   filter(Type=='Mutation_Driver',Gene_short %in% 'EGFR') %>% 
#   filter(Acc=='1x') %>% 
#   left_join(covdata0) %>% 
#   filter(SP_Group_New %in% c('AS_N','EU_N','EU_S')) %>%
#   group_by(SP_Group_New) %>% 
#   do(tidy(lm(Latency~Alteration+Gender+Histology+Tumor_Purity,data=.))) %>% 
#   filter(term=='AlterationYes') %>% 
#   arrange(p.value)
# #do(tidy(t.test(Latency~Alteration,data=.)))

6 Fig. 2b-c - Latency association with EGFR, Gender, Sex

Figure 2b: Forest plot illustrating associations between tumor latency and EGFR mutation status, adjusted for sex and tumor purity (percentage of cancer cells within a tumor sample). P-values and regression coefficients with 95% confidence intervals (CIs) are provided for each variable. Significant associations are in red.

Figure 2c: Box plots displaying estimated tumor latency separated by EGFR mutation status and sex.

tmp <- sherlock_data_full %>% 
  filter(Type=='Mutation_Driver',Gene=='EGFR') %>% 
  select(Tumor_Barcode,EGFR=Alteration)

tdata <- MRCAdata %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  filter(acceleration=='1x', Tumor_Barcode %in% hq_samples2) %>%
  left_join(sp_group_data2) %>% 
  left_join(covdata0 %>% select(Tumor_Barcode,Gender,Tumor_Purity)) %>% 
  filter(SP_Group_New %in% c('AS_N','EU_N','EU_S','Others')) %>% 
  left_join(tmp)

tdata %>% do(tidy(wilcox.test(Latency~Gender,data=.)))
## # A tibble: 1 × 4
##   statistic p.value method                                           alternative
##       <dbl>   <dbl> <chr>                                            <chr>      
## 1     25393 0.00167 Wilcoxon rank sum test with continuity correcti… two.sided
tmp <- tdata %>% 
  group_by(SP_Group_New) %>% do(tidy(wilcox.test(Latency~Gender,data=.))) %>% 
  mutate(label=paste0(SP_Group_New,'\nP=',scientific_format()(p.value))) %>%  select(SP_Group_New,label)

tdata %>% 
  left_join(tmp) %>% 
  ggplot(aes(Gender,Latency,fill=Gender))+
  geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  scale_fill_manual(values = c("Female" = "#4daf4a","Male" = "#cccccc"))+
  #scale_fill_manual(values = sigcol)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
  #theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
  theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'bottom',panel.spacing.x = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 2))+
  labs(x = "", y = 'Latency, years before diagnosis',fill='Sex')+
  scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
  facet_grid(.~label,scales = 'free',space='free')+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/latency_gender_signatures.pdf',width = 6,height = 5.8,device = cairo_pdf)


p1 <- tdata %>% 
  filter(SP_Group_New == 'EU_N') %>% 
  ggplot(aes(Gender,Latency,fill=Gender))+
  geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  scale_fill_manual(values = c("Female" = "#984ea3","Male" = "#cccccc"))+
  #scale_fill_manual(values = sigcol)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
  theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
  labs(x = "", y = 'Latency, years before diagnosis',fill='Sex')+
  scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
  facet_grid(.~EGFR,scales = 'free',space='free')+
  panel_border(color = 'black',linetype = 1)
p1

p2 <- tdata %>% 
  filter(SP_Group_New == 'EU_N') %>% 
  ggplot(aes(EGFR,Latency,fill=EGFR))+
  geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
  #scale_fill_manual(values = sigcol)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
  theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
  labs(x = "", y = 'Latency, years before diagnosis',fill='EGFR Mutation')+
  scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
  facet_grid(.~Gender,scales = 'free',space='free')+
  panel_border(color = 'black',linetype = 1)
p2

plot_grid(p1,p2,align = 'v',axis = 'lr')

#ggsave(filename = './output/latency_gender_EGFR_signatures2.pdf',width = 7.5,height = 5.5,device = cairo_pdf)

tdata %>% 
  filter(SP_Group_New=='EU_N') %>%
  group_by(Gender) %>% 
  do(tidy(wilcox.test(Latency~EGFR,data=.)))
## # A tibble: 2 × 5
## # Groups:   Gender [2]
##   Gender statistic  p.value method                                   alternative
##   <fct>      <dbl>    <dbl> <chr>                                    <chr>      
## 1 Male         115 0.694    Wilcoxon rank sum exact test             two.sided  
## 2 Female      1649 0.000144 Wilcoxon rank sum test with continuity … two.sided
tdata %>% 
  filter(SP_Group_New=='EU_N') %>%
  group_by(EGFR) %>% 
  do(tidy(wilcox.test(Latency~Gender,data=.)))
## # A tibble: 2 × 5
## # Groups:   EGFR [2]
##   EGFR  statistic p.value method                                     alternative
##   <chr>     <dbl>   <dbl> <chr>                                      <chr>      
## 1 No          606 0.332   Wilcoxon rank sum test with continuity co… two.sided  
## 2 Yes         225 0.00275 Wilcoxon rank sum test with continuity co… two.sided
tdata %>% 
  group_by(SP_Group_New) %>% 
  do(tidy(wilcox.test(Latency~Gender,data=.)))
## # A tibble: 4 × 5
## # Groups:   SP_Group_New [4]
##   SP_Group_New statistic p.value method                              alternative
##   <chr>            <dbl>   <dbl> <chr>                               <chr>      
## 1 AS_N              1899  0.911  Wilcoxon rank sum test with contin… two.sided  
## 2 EU_N              1646  0.0103 Wilcoxon rank sum test with contin… two.sided  
## 3 EU_S              1012  0.758  Wilcoxon rank sum test with contin… two.sided  
## 4 Others             271  0.0510 Wilcoxon rank sum exact test        two.sided
tmpresult <- tdata %>% 
  group_by(SP_Group_New) %>% 
  do(tidy(lm(Latency~Gender+EGFR+Tumor_Purity,data=.),conf.int = TRUE, conf.level = 0.95)) %>% 
  filter(term!='(Intercept)') %>% 
  ungroup() %>% 
  arrange(p.value) %>% 
  mutate(term=str_remove(term,"EGFR")) %>% 
  mutate(term=str_remove(term,"Histology")) %>% 
  mutate(term=str_remove(term,"Gender"))

tmpresult <- tmpresult %>% mutate(label=paste0('β = ',round(estimate,2), ', p = ',scientific_format(digits = 3)(p.value))) 
# %>%
#   bind_rows(
#     tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='AS_N'),
#     tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_N'),
#     tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_S')
#   )


# tmplevels <- c('Tumor_Purity','Male','Female','No','Yes')
# tmplables <- c('Tumor purity', 'Male','Female','EGFR wildtype','EGFR mutant')

tmplevels <- c('Tumor_Purity','Female','Yes')
tmplables <- c('Tumor purity', 'Female\nRef=Male','EGFR Mutant\nRef=Wildtype')

tmpresult %>% 
  mutate(term=factor(term,levels=tmplevels,labels=tmplables)) %>% 
  ggplot(aes(estimate, term, xmin = conf.low, xmax = conf.high, height = 0,color=p.value<0.05)) +
  geom_point(pch=19,size=3) +
  geom_vline(xintercept = 0, lty = 4) +
  geom_errorbarh(height = .3)+
  scale_color_manual(values = c("FALSE"='black',"TRUE"='red'))+
  ggrepel::geom_text_repel(aes(label=label),size=3.5,nudge_y = 0.2)+
  facet_grid(~SP_Group_New)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(panel.spacing = unit(0.2,"cm"))+
  labs(x = "Regression coefficient for tumor latency", y = NULL)+
  guides(color="none")+
  scale_x_continuous(breaks = pretty_breaks())+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/latency_gender_EGFR_signatures.pdf',width = 14,height = 3,device = cairo_pdf)

7 Fig. 2e - Latency association (Multivarible regression)

A multivariate regression analysis examining the relationship between tumor latency and various factors, including sex, ancestry, smoking, EGFR mutations, KRAS mutations, and mutational signature ID2. Statistically significant associations (P<0.05) are denoted in red.

tmp1 <- sherlock_data_full %>% 
  filter(Type=='Mutation_Driver',Gene=='EGFR') %>% 
  select(Tumor_Barcode,EGFR=Alteration)

tmp2 <- sherlock_data_full %>% 
  filter(Type=='Mutation_Driver',Gene=='KRAS') %>% 
  select(Tumor_Barcode,KRAS=Alteration)

tmp3 <- sherlock_data_full %>% 
  filter(Type=='Mutation_Driver',Gene=='TP53') %>% 
  select(Tumor_Barcode,TP53=Alteration)

tdata <- MRCAdata %>% 
  filter(Tumor_Barcode %in% hq_samples2) %>% 
  filter(acceleration=='1x', Tumor_Barcode %in% hq_samples2) %>%
  left_join(sp_group_data2) %>% 
  left_join(covdata0 %>% select(Tumor_Barcode,Gender,Smoking,Assigned_Population,Tumor_Purity)) %>% 
  left_join(id2data %>% select(Tumor_Barcode, ID2_Present)) %>% 
  #filter(SP_Group_New %in% c('AS_N','EU_N','EU_S')) %>% 
  left_join(tmp1) %>% 
  left_join(tmp2) %>% 
  left_join(tmp3) %>% 
  mutate(Smoking = if_else(Smoking == 'Unknown',NA_character_,Smoking)) %>% 
  mutate(Smoking = factor(Smoking, levels=c('Smoker','Non-Smoker')))


tmpresult <- tdata %>% 
  do(tidy(lm(Latency~Gender+Assigned_Population+Smoking+ID2_Present+EGFR+KRAS+Tumor_Purity,data=.),conf.int = TRUE, conf.level = 0.95)) %>% 
  filter(term!='(Intercept)') %>% 
  ungroup() %>% 
  arrange(desc(estimate)) %>% 
  #mutate(term=str_remove(term,"EGFR")) %>% 
  mutate(term=str_remove(term,"Histology")) %>% 
  mutate(term=str_remove(term,"Gender")) %>% 
  mutate(label=paste0('β = ',round(estimate,2), ', p = ',scientific_format(digits = 3)(p.value)))

# tmpresult <- tmpresult %>% mutate(label=paste0('β = ',round(estimate,2), ', p = ',scientific_format(digits = 3)(p.value))) %>%
#   bind_rows(
#     tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='AS_N'),
#     tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_N'),
#     tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_S')
#   )
# tmplevels <- c('Tumor_Purity','Male','Female','No','Yes')
# 
# tmplables <- c('Tumor purity', 'Male','Female','EGFR wildtype','EGFR mutant')

tmplevels <-  tmpresult$term
tmplables <- tmpresult$term

#tmplables <- c('EGFR Mutant\nRef=Wildtype','Female\nRef=Male','Smokers\nRef=Nonsmokers','Tumor Purity','Ancestry EAS\nRef=EUR','Ancestry Others\nRef=EUR','KRAS Mutant\nRef=Wildtype','ID2 Signature Present\nRef=Absent')

tmplables <- c('EGFR Mutant\nRef=Wildtype','Female\nRef=Male','Tumor Purity','Ancestry EAS\nRef=EUR','Nonsmokers\nRef=Smokers','Ancestry Others\nRef=EUR','KRAS Mutant\nRef=Wildtype','ID2 Signature Present\nRef=Absent')

tmpresult %>% 
  mutate(p.value = if_else(p.value>0.05,0.5,p.value)) %>% 
  mutate(term=factor(term,levels=tmplevels,labels=tmplables)) %>% 
  ggplot(aes(estimate, term, xmin = conf.low, xmax = conf.high, height = 0,color=-log10(p.value))) +
  geom_vline(xintercept = 0, lty = 5) +
  geom_errorbarh(height = .3)+
  geom_point(pch=19,size=4) +
  #scale_color_manual(values = c("FALSE"='black',"TRUE"='red'))+
  scale_color_material(palette = 'red')+
  ggrepel::geom_text_repel(aes(label=label),size=3.5,nudge_y = 0.4)+
  #facet_grid(~SP_Group_New)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid_col = 'gray85')+
  theme(panel.spacing = unit(0.2,"cm"))+
  labs(x = "Regression coefficient", y = NULL)+
  guides(color="none")+
  scale_x_continuous(breaks = pretty_breaks())+
  panel_border(color = 'black',linetype = 1)

##ggsave(filename = './output/latency_multivariables.pdf',width = 8,height = 5,device = cairo_pdf)

summary(lm(Latency~Gender+Assigned_Population+Smoking+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.06051831
summary(lm(Latency~Gender+Assigned_Population+Smoking+ID2_Present+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.08865073
summary(lm(Latency~Gender+Assigned_Population+Smoking+EGFR+KRAS+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.08998911
summary(lm(Latency~Gender+Assigned_Population+Smoking+ID2_Present+EGFR+KRAS+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.1173433
tdata <- tdata %>% filter(Smoking == 'Non-Smoker',Assigned_Population %in% c('EAS','EUR'))
#summary(lm(Latency~SP_Group_New+ID2_Present + KRAS +EGFR+SP_Group_New:EGFR + Tumor_Purity + Gender,data=tdata)) %>% tidy() %>% arrange(p.value)
summary(lm(Latency~Assigned_Population+EGFR+EGFR:Assigned_Population + Tumor_Purity + Gender,data=tdata)) %>% tidy() %>% arrange(p.value)
## # A tibble: 6 × 5
##   term                           estimate std.error statistic   p.value
##   <chr>                             <dbl>     <dbl>     <dbl>     <dbl>
## 1 EGFRYes                           7.73       1.89     4.08  0.0000553
## 2 (Intercept)                      10.1        3.24     3.10  0.00205  
## 3 GenderFemale                      3.70       1.79     2.06  0.0399   
## 4 Assigned_PopulationEAS:EGFRYes   -4.97       2.82    -1.76  0.0790   
## 5 Tumor_Purity                      5.11       5.03     1.02  0.310    
## 6 Assigned_PopulationEAS            0.850      2.16     0.394 0.694

8 Additional Supplementary Figures

# Supplementary Fig. 12 ---------------------------------------------------

#Supplementary Fig. 12a, overall 
mrate %>% 
  left_join(sp_group_data) %>%
  #filter(Tumor_Barcode %in% hq_samples2) %>% 
  #filter(SP_Group!="Others") %>% 
  drop_na() %>% 
  ggplot(aes(age,log2(CpG.TpG.Gb)))+
  geom_point(pch=21,size=2,fill=ncicolpal[14]) +
  geom_smooth(method = 'lm')+
  #facet_wrap(~SP_Group_New,nrow = 1,scales = 'free_x')+
  scale_fill_manual(values = sp_group_color_new)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  labs(x = "Age at diagnosis (years)", y = "SNVs/Gb (log2)")+
  guides(fill="none")+
  scale_x_continuous(breaks = pretty_breaks())+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/mutation_rate_age_all.pdf',width = 4,height = 3.6,device = cairo_pdf)

# Supplementary Fig. 12b, seperated by the group
mrate %>% 
  left_join(sp_group_data) %>% 
  #filter(SP_Group!="Others") %>% 
  #filter(Tumor_Barcode %in% hq_samples2) %>% 
  drop_na() %>% 
  ggplot(aes(age,log2(CpG.TpG.Gb),fill=SP_Group_New))+
  geom_point(pch=21,size=2) +
  geom_smooth(method = 'lm')+
  facet_wrap(~SP_Group_New,nrow = 1,scales = 'free_x')+
  scale_fill_manual(values = sp_group_color_new)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  labs(x = "Age at diagnosis (years)", y = "SNVs/Gb (log2)")+
  guides(fill="none")+
  scale_x_continuous(breaks = pretty_breaks())+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/mutation_rate_age.pdf',width = 12,height = 3.5,device = cairo_pdf)

mrate %>% do(tidy(lm(log2(CpG.TpG.Gb)~age,data=.))) %>% filter(term!="(Intercept)") 
## # A tibble: 1 × 5
##   term  estimate std.error statistic   p.value
##   <chr>    <dbl>     <dbl>     <dbl>     <dbl>
## 1 age     0.0167   0.00389      4.29 0.0000209
mrate %>% group_by(SP_Group) %>% do(tidy(lm(log2(CpG.TpG.Gb)~age,data=.))) %>% filter(term!="(Intercept)") %>% ungroup() %>% arrange(p.value)
## # A tibble: 4 × 6
##   SP_Group term  estimate std.error statistic    p.value
##   <chr>    <chr>    <dbl>     <dbl>     <dbl>      <dbl>
## 1 N_A      age    0.0246    0.00508     4.85  0.00000267
## 2 N_U      age    0.0102    0.00485     2.10  0.0372    
## 3 Others   age    0.0160    0.0124      1.29  0.205     
## 4 S_U      age   -0.00410   0.0104     -0.395 0.694
#mrate %>% left_join(clinical_data) %>% filter(Histology == 'Adenocarcinoma') %>% group_by(SP_Group) %>% do(tidy(lm(log2(CpG.TpG.Gb)~age,data=.))) %>% filter(term!="(Intercept)") %>% ungroup() %>% arrange(p.value)
#sorder <- read_rds('../SmokingVar/sorder.RDS')
#mrate %>% left_join(sorder) %>% filter(!is.na(APOBEC_Signature)) %>% group_by(APOBEC_Signature) %>% do(tidy(lm(CpG.TpG.Gb~age,data=.))) %>% filter(term!="(Intercept)") %>% ungroup() %>% arrange(p.value)


# Supplementary Fi. 13a ---------------------------------------------------
# average rate as barplot
as.data.frame(qRateDeam) %>% 
  rownames_to_column() %>% 
  pivot_longer(cols = -rowname) %>% 
  mutate(rowname = paste0("d",str_remove(rowname,"%"))) %>% 
  pivot_wider(names_from = rowname,values_from = value) %>% 
  filter(name!="Others") %>% 
  dplyr::rename(SP_Group=name) %>% 
  left_join(sp_group_data) %>% 
  ggplot(aes(SP_Group_New,d50,fill=SP_Group_New))+
  geom_col(width = 0.7)+
  scale_fill_manual(values = sp_group_color_new)+
  guides(fill="none")+
  geom_errorbar(aes(ymin=d0,ymax=d100),width=0.1,size=0.7)+
  labs(y = "CpG>TpG rate [SNVs/Gb/yr]", x= "")+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  panel_border(color = 'black',linetype = 1)

#ggsave(filename = './output/mutation_rate_age_average.pdf',width = 3,height = 4,device = cairo_pdf)

# Latency Plot by differnt accelration rate ------------------------------------------------------------
u <- base::setdiff(names(finalSnv), remove)
guessAccel <- sapply(subclonesTimeAbs, function(x) "5x")
qSubclone <- sapply(subclonesTimeAbs, function(x) apply(x[,"hat",][rownames(x)%in%u,,drop=FALSE], 2, quantile, c(0.05,0.25,0.5,0.75,0.95), na.rm=TRUE), simplify='array')
subclonesTimeAbsType <- sapply(names(subclonesTimeAbs), function(n) {x <- subclonesTimeAbs[[n]]; x[,,guessAccel[n]][setdiff(rownames(x),remove), 1:3, drop=FALSE]})
nSubclones <- sapply(subclonesTimeAbsType, function(x) sum(!is.na(x[,1])))

qSubclone <- qSubclone[,,-2]

m <- qSubclone["50%","5x",]#t[1,3,]
names(m) <- dimnames(qSubclone)[[3]]
m[nSubclones < 5] <- NA
o <- rev(order(m, na.last=NA))
n <- dimnames(qSubclone)[[3]]

#cairo_pdf(filename = 'Acceleration_alterative.pdf',width = 3,height = 5)
par(mar=c(3,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
plot(NA,NA, xlim=c(0.5,length(m[o])+0.5), ylab="Latency [yr]", xlab="", xaxt="n", yaxs="i", ylim=c(0,18))
abline(h=seq(10,20,10), col="#DDDDDD", lty=3)
x <- seq_along(m[o])
mg14::rotatedLabel(x, labels=as.character(sp_group_lift[names(rev(sort(m)))]))
b <- .3
rect(seq_along(o)-b,qSubclone["50%","1x",o],seq_along(o)+b,qSubclone["50%","10x",o], col=alpha(as.character(sp_group_color[n[o]]),0.4), border=1)
rect(seq_along(o)-b,qSubclone["50%","2.5x",o],seq_along(o)+b,qSubclone["50%","7.5x",o], col=alpha(as.character(sp_group_color[n[o]]),0.9), border=1)
rect(seq_along(o)-b,qSubclone["50%","20x",o],seq_along(o)+b,qSubclone["50%","10x",o], col=alpha(as.character(sp_group_color[n[o]]),0.2), border=1)
segments(seq_along(o)-b,qSubclone["50%","5x",o],seq_along(o)+b,qSubclone["50%","5x",o])

#dev.off()


# Tumor latency vs proliferation maker ---------------------
tmp <- rdata1 %>% 
  filter(RNAseq_Type == 'Tumor', Gene %in% c('MYBL2','BUB1','PLK1','CCNE1','CCNB1','BUB1','FOXM1','TOP2A','MKI67'))

tdata <- MRCAdata %>% 
  filter(acceleration == "1x") %>% 
  mutate(Group=if_else(Latency>8,'High','Low')) %>% 
  left_join(tmp) %>% 
  left_join(sp_group_data2) %>% 
  filter(!is.na(Exp),!is.na(Latency))


tdata %>% group_by(Gene) %>% do(tidy(wilcox.test(Exp~Group,data=.))) %>% ungroup() %>%  mutate(FDR=p.adjust(p.value))
## # A tibble: 8 × 6
##   Gene  statistic  p.value method                            alternative     FDR
##   <chr>     <dbl>    <dbl> <chr>                             <chr>         <dbl>
## 1 BUB1      10476 0.0109   Wilcoxon rank sum test with cont… two.sided   0.0394 
## 2 CCNB1      9769 0.000787 Wilcoxon rank sum test with cont… two.sided   0.00630
## 3 CCNE1     10440 0.00970  Wilcoxon rank sum test with cont… two.sided   0.0394 
## 4 FOXM1     10193 0.00410  Wilcoxon rank sum test with cont… two.sided   0.0246 
## 5 MKI67     10505 0.0120   Wilcoxon rank sum test with cont… two.sided   0.0394 
## 6 MYBL2     10378 0.00787  Wilcoxon rank sum test with cont… two.sided   0.0394 
## 7 PLK1      10084 0.00274  Wilcoxon rank sum test with cont… two.sided   0.0192 
## 8 TOP2A     10440 0.00970  Wilcoxon rank sum test with cont… two.sided   0.0394
my_comparisons <- list(c("High",'Low'))
tdata%>% 
  mutate(Group=fct_rev(Group)) %>% 
  ggplot(aes(Group,Exp,fill=Group))+
  geom_boxplot(width=0.7,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="white",stroke=0.02)+
  scale_fill_jama()+
  facet_wrap(~Gene,nrow = 1)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Tumor latency", y = 'RNA-Seq expression log2(CPM)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/latency_proliferation_expression.pdf',width = 8,height = 6,device = cairo_pdf)



# TP53 vs Tumor latenc/MRCA age
tdata <- MRCAdata %>% 
  filter(acceleration == "1x") %>% 
  mutate(Group=if_else(Latency>8,'High','Low')) %>% 
  left_join(
    sherlock_data_full %>% filter(Gene=='TP53',Type=='Mutation_Driver') %>% select(Tumor_Barcode,TP53=Alteration)
  ) %>% 
  left_join(sp_group_data2)

tdata <- tdata %>% mutate(SP_Group_New = 'ALL') %>% bind_rows(tdata)

my_comparisons <- list(c("Yes",'No'))

tdata %>% 
  ggplot(aes(TP53,MRCA_age,fill=TP53))+
  geom_boxplot(width=0.7,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="white",stroke=0.02)+
  scale_fill_jama()+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  facet_wrap(~SP_Group_New,nrow = 1)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "TP53 mutation", y = 'Tumor MRCA years')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/TP53_MRCA_years.pdf',width = 5,height = 5,device = cairo_pdf)


tmp <- rdata1 %>% 
  filter(RNAseq_Type == 'Tumor', Gene %in% c('MYBL2','BUB1','PLK1','CCNE1','CCNB1','BUB1','FOXM1','TOP2A','MKI67'))

sherlock_data_full %>%
  filter(Gene=='TP53',Type=='Mutation_Driver') %>% 
  select(Tumor_Barcode,TP53=Alteration) %>%
  left_join(tmp) %>% 
  left_join(sp_group_data2) %>% 
  filter(Tumor_Barcode %in% hq_samples2,!is.na(Exp)) %>% 
  ggplot(aes(TP53,Exp,fill=TP53))+
  geom_boxplot(width=0.7,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="white",stroke=0.02)+
  scale_fill_jama()+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  facet_wrap(~Gene,nrow = 1)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "TP53 mutation", y = 'RNA-Seq expression log2(CPM)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)

  #stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/TP53_proliferation_expression.pdf',width = 9,height = 6,device = cairo_pdf)

# Latency difference between ID2 present vs ID2 absent in KRAS mutant group --------------------------
my_comparisons <- list(c("Absent",'Present'))
MRCAdata %>% 
  filter(acceleration == '1x') %>% 
  left_join(id2data) %>% 
  left_join(
    sherlock_data_full %>%
      filter(Gene=='KRAS',Type=='Mutation_Driver') %>% 
      select(Tumor_Barcode,KRAS=Alteration)
  ) %>% 
  left_join(sp_group_data2) %>% 
  filter(KRAS=='Yes',Tumor_Barcode %in% hq_samples2) %>% 
  filter(SP_Group_New %in% c('EU_N','EU_S')) %>% 
  ggplot(aes(ID2_Present,Latency,fill=ID2_Present))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=3,position = position_jitter(width = 0.15,height = 0),color="black",stroke=0.02)+
  scale_fill_manual(values = id2color)+
  facet_wrap(~SP_Group_New)+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
  labs(x = "Mutational signature ID2", y = 'Tumor latency years')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons,method = 't.test',method.args = list(alternative='greater'))

#ggsave(filename = './output/KRAS_ID2_present_vs_absent_latency.pdf',width = 5,height = 6,device = cairo_pdf)

# Latency difference between AS_N and EU_N in the EGFR mutant group --------------------------
my_comparisons <- list(c("AS_N",'EU_N'))
MRCAdata %>% 
  filter(acceleration == '1x') %>% 
  left_join(id2data) %>% 
  left_join(
    sherlock_data_full %>%
      filter(Gene=='EGFR',Type=='Mutation_Driver') %>% 
      select(Tumor_Barcode,EGFR=Alteration) %>% 
      mutate(EGFR=if_else(EGFR=='Yes','EGFR mutant','EGFR wildtype'))
  ) %>% 
  left_join(sp_group_data2) %>% 
  filter(Tumor_Barcode %in% hq_samples2, SP_Group_New %in% c('AS_N','EU_N'),!is.na(Latency)) %>% 
  ggplot(aes(SP_Group_New,Latency,fill=SP_Group_New))+
  geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=3,position = position_jitter(width = 0.15,height = 0),color="black",stroke=0.02)+
  facet_wrap(~EGFR)+
  scale_fill_manual(values = sp_group_color_new)+
  scale_y_continuous(breaks = pretty_breaks(n = 7))+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4),axis.title.x = element_blank())+
  labs(y = 'Tumor latency years')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/EGFR_SP_Group_latency.pdf',width = 4,height = 6,device = cairo_pdf)


# SP_group Latency difference----------------------------------------------------------------
my_comparisons <- list( c("AS_N", "EU_N"), c("AS_N", "EU_S"), c("EU_N", "EU_S") )
my_comparisons <- list( c("AS_N", "EU_N"), c("AS_N", "EU_S"), c("EU_N", "EU_S"),c("AS_N", "Others"), c("Others", "EU_S"), c("EU_N", "Others") )

MRCAdata %>% 
  filter(acceleration == "1x") %>% 
  group_by(SP_Group) %>% 
  summarise(Latency =median(Latency,na.rm = T),Age =median(Age,na.rm = T),MRCA_age =median(MRCA_age,na.rm = T))
## # A tibble: 4 × 4
##   SP_Group Latency   Age MRCA_age
##   <chr>      <dbl> <dbl>    <dbl>
## 1 N_A         16.8  63       43.8
## 2 N_U         14.9  67.9     48.0
## 3 Others      11.0  66.2     52.4
## 4 S_U         14.8  66.6     50.3
MRCAdata %>% 
  left_join(sp_group_data) %>% 
  filter(acceleration == "1x") %>% 
  filter(!is.na(Latency)) %>% 
  #filter(SP_Group!="Others") %>% 
  ggplot(aes(SP_Group_New,Latency,fill=SP_Group_New))+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="black")+
  scale_fill_manual(values = sp_group_color_new)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  labs(x = "", y = 'Latency, years before diagnosis')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/latency_spgroup_1x.pdf',width = 4,height = 6,device = cairo_pdf)

MRCAdata %>% 
  left_join(sp_group_data) %>% 
  filter(acceleration == "1x") %>% 
  #filter((SP_Group %in% c('AS_N','EU_N') & acceleration == "1x")|(SP_Group %in% c('EU_S') & acceleration == "7.5x")) %>% 
  filter(!is.na(Latency)) %>% 
  #filter(SP_Group!="Others") %>% 
  ggplot(aes(SP_Group_New,MRCA_age,fill=SP_Group_New))+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0),color="black")+
  scale_fill_manual(values = sp_group_color_new)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  labs(x = "", y = 'MRCA Age (years)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)#+

#stat_compare_means(label.y = 100) 

#ggsave(filename = './output/MRCA_spgroup.pdf',width = 4,height = 6,device = cairo_pdf)

MRCAdata %>% 
  left_join(sp_group_data) %>% 
  filter(acceleration == "1x") %>% 
  #filter((SP_Group %in% c('AS_N','EU_N') & acceleration == "1x")|(SP_Group %in% c('EU_S') & acceleration == "7.5x")) %>% 
  filter(!is.na(Latency)) %>% 
  #filter(SP_Group!="Others") %>% 
  ggplot(aes(SP_Group_New,Age,fill=SP_Group_New))+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0),color="black")+
  scale_fill_manual(values = sp_group_color_new)+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  labs(x = "", y = 'Age at diagnosis (years)')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/MRCA_spgroup_age.pdf',width = 4,height = 6,device = cairo_pdf)

MRCAdata %>% 
  left_join(sp_group_data) %>% 
  filter(acceleration == "1x") %>% 
  #filter((SP_Group %in% c('AS_N','EU_N') & acceleration == "1x")|(SP_Group %in% c('EU_S') & acceleration == "5x")) %>% 
  filter(!is.na(Latency)) %>% 
  filter(SP_Group!="Others") %>% 
  filter(SP_Group!='N_U') %>% 
  do(tidy(wilcox.test(MRCA_age~SP_Group,data=.)))
## # A tibble: 1 × 4
##   statistic p.value method                                           alternative
##       <dbl>   <dbl> <chr>                                            <chr>      
## 1      7777 0.00103 Wilcoxon rank sum test with continuity correcti… two.sided
# WGD latency -------------------------------------------------------------
my_comparisons <- list( c("WGD", "nWGD"))

MRCAdata %>% 
  left_join(sp_group_data) %>% 
  filter(acceleration == "1x") %>% 
  filter(!is.na(Latency)) %>% 
  filter(SP_Group!="Others") %>% 
  left_join(BBsolution4 %>% select(Tumor_Barcode,WGD_Status)) %>% 
  ggplot(aes(WGD_Status,Latency,fill=WGD_Status))+
  geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
  geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="black")+
  facet_wrap(~SP_Group_New)+
  scale_fill_jama()+
  theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
  theme(panel.spacing = unit(0.1,"cm"))+
  labs(x = "", y = 'Latency, years before diagnosis')+
  guides(fill="none")+
  panel_border(color = 'black',linetype = 1)+
  stat_compare_means(comparisons = my_comparisons)

#ggsave(filename = './output/latency_WGD_spgroup.pdf',width = 5,height = 6,device = cairo_pdf)

9 R session information

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.6.1       ggsci_3.2.0        broom_1.0.8        ggpmisc_0.6.2     
##  [5] ggpp_0.5.9         ggasym_0.1.6       data.table_1.17.8  ggnewscale_0.5.2  
##  [9] ggrepel_0.9.6      cowplot_1.2.0      hrbrthemes_0.8.7   scales_1.4.0      
## [13] ggsankey_0.0.99999 lubridate_1.9.4    forcats_1.0.0      stringr_1.5.1     
## [17] dplyr_1.1.4        purrr_1.0.4        readr_2.1.5        tidyr_1.3.1       
## [21] tibble_3.3.0       ggplot2_3.5.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6            xfun_0.52               bslib_0.9.0            
##  [4] rstatix_0.7.2           lattice_0.22-7          tzdb_0.5.0             
##  [7] vctrs_0.6.5             tools_4.3.2             generics_0.1.4         
## [10] pkgconfig_2.0.3         Matrix_1.6-5            RColorBrewer_1.1-3     
## [13] lifecycle_1.0.4         compiler_4.3.2          farver_2.1.2           
## [16] MatrixModels_0.5-4      carData_3.0-5           SparseM_1.84-2         
## [19] fontquiver_0.2.1        fontLiberation_0.1.0    quantreg_6.1           
## [22] htmltools_0.5.8.1       sass_0.4.10             yaml_2.3.10            
## [25] Formula_1.2-5           Rttf2pt1_1.3.12         car_3.1-3              
## [28] pillar_1.11.0           jquerylib_0.1.4         extrafontdb_1.0        
## [31] MASS_7.3-60.0.1         cachem_1.1.0            mg14_0.0.6             
## [34] abind_1.4-8             nlme_3.1-168            fontBitstreamVera_0.1.1
## [37] tidyselect_1.2.1        digest_0.6.37           stringi_1.8.7          
## [40] labeling_0.4.3          splines_4.3.2           extrafont_0.19         
## [43] fastmap_1.2.0           grid_4.3.2              cli_3.6.5              
## [46] magrittr_2.0.3          utf8_1.2.6              dichromat_2.0-0.1      
## [49] survival_3.8-3          withr_3.0.2             backports_1.5.0        
## [52] gdtools_0.4.2           timechange_0.3.0        rmarkdown_2.29         
## [55] ggsignif_0.6.4          hms_1.1.3               evaluate_1.0.4         
## [58] knitr_1.50              viridisLite_0.4.2       mgcv_1.9-1             
## [61] rlang_1.1.6             Rcpp_1.1.0              glue_1.8.0             
## [64] polynom_1.4-1           rstudioapi_0.17.1       jsonlite_2.0.0         
## [67] R6_2.6.1                systemfonts_1.2.3